In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
import shap        
        
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker

from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold, cross_val_predict
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from mpl_toolkits.axes_grid1 import make_axes_locatable

sns.set_context("paper")
shap.initjs()
/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv
No description has been provided for this image
In [2]:
# Load the competition training data and report its dimensions.
houses = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
n_rows, n_cols = houses.shape
print(f"Rows: {n_rows:,}")
print(f"Cols: {n_cols:,}")
houses.head(3)
Rows: 1,460
Cols: 81
Out[2]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 2 2008 WD Normal 208500
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2007 WD Normal 181500
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 9 2008 WD Normal 223500

3 rows × 81 columns

In [3]:
RANDOM_STATE = 42

General Overview¶

In [4]:
# Bucket the column names by dtype so each group can be reported once.
dtype_groups = houses.columns.to_series().groupby(houses.dtypes).apply(list)

int_cols, float_cols, object_cols = [], [], []

for dtype, columns in dtype_groups.items():
    # Remember each group for later feature selection.
    if dtype == 'int64':
        int_cols = columns
    elif dtype == 'float64':
        float_cols = columns
    else:
        object_cols = columns

    print(f"There are {len(columns):3} columns of type {dtype}:")
    print(columns)
    print("\n")
There are  35 columns of type int64:
['Id', 'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice']


There are   3 columns of type float64:
['LotFrontage', 'MasVnrArea', 'GarageYrBlt']


There are  43 columns of type object:
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']


In [5]:
print(houses.dtypes)
Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object
In [6]:
# Column names of the row identifier and the regression target.
ID = 'Id'
TARGET = 'SalePrice'

Numerical Variables¶

In [7]:
# All numeric columns, with the identifier and the target excluded.
num_features = [col for col in int_cols + float_cols if col not in (ID, TARGET)]
In [8]:
temp = houses[num_features]

# Per-variable missing/unique statistics, sorted by missing rate (descending).
missing_counts = temp.isnull().sum()
unique_counts = temp.nunique()

stats = pd.DataFrame({
    'Variable Name': temp.columns,
    'Missing Count': missing_counts.values,
    'Missing Rate (%)': 100 * temp.isnull().mean().values,
    'Unique Count': unique_counts.values,
    'Unique Rate (%)': 100 * unique_counts.values / len(temp)
}).sort_values('Missing Rate (%)', ascending=False)
stats
Out[8]:
Variable Name Missing Count Missing Rate (%) Unique Count Unique Rate (%)
33 LotFrontage 259 17.739726 110 7.534247
35 GarageYrBlt 81 5.547945 97 6.643836
34 MasVnrArea 8 0.547945 327 22.397260
26 EnclosedPorch 0 0.000000 120 8.219178
20 TotRmsAbvGrd 0 0.000000 12 0.821918
21 Fireplaces 0 0.000000 4 0.273973
22 GarageCars 0 0.000000 5 0.342466
23 GarageArea 0 0.000000 441 30.205479
24 WoodDeckSF 0 0.000000 274 18.767123
25 OpenPorchSF 0 0.000000 202 13.835616
27 3SsnPorch 0 0.000000 20 1.369863
1 LotArea 0 0.000000 1073 73.493151
28 ScreenPorch 0 0.000000 76 5.205479
29 PoolArea 0 0.000000 8 0.547945
30 MiscVal 0 0.000000 21 1.438356
31 MoSold 0 0.000000 12 0.821918
32 YrSold 0 0.000000 5 0.342466
19 KitchenAbvGr 0 0.000000 4 0.273973
0 MSSubClass 0 0.000000 15 1.027397
17 HalfBath 0 0.000000 3 0.205479
8 BsmtUnfSF 0 0.000000 780 53.424658
2 OverallQual 0 0.000000 10 0.684932
3 OverallCond 0 0.000000 9 0.616438
4 YearBuilt 0 0.000000 112 7.671233
5 YearRemodAdd 0 0.000000 61 4.178082
6 BsmtFinSF1 0 0.000000 637 43.630137
7 BsmtFinSF2 0 0.000000 144 9.863014
9 TotalBsmtSF 0 0.000000 721 49.383562
16 FullBath 0 0.000000 4 0.273973
10 1stFlrSF 0 0.000000 753 51.575342
11 2ndFlrSF 0 0.000000 417 28.561644
12 LowQualFinSF 0 0.000000 24 1.643836
13 GrLivArea 0 0.000000 861 58.972603
14 BsmtFullBath 0 0.000000 4 0.273973
15 BsmtHalfBath 0 0.000000 3 0.205479
18 BedroomAbvGr 0 0.000000 8 0.547945
In [9]:
# Distribution of the raw (untransformed) target.
fig, ax = plt.subplots(figsize=(8, 4))

ax.hist(houses[TARGET], bins=50, edgecolor='white', linewidth=1.5,
        color='#4682B4', alpha=0.7)
ax.set_xlabel(TARGET)
ax.set_title("Distribution of Target SalePrice", fontsize=12)

# Render prices in thousands, e.g. 200000 -> "200k".
ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda y, pos: f'{int(y/1_000)}k'))

plt.show()
No description has been provided for this image
In [10]:
# One small histogram per numeric feature, laid out on an 8x5 grid.
fig, axes = plt.subplots(nrows=8, ncols=5, figsize=(15, 18))
axes = axes.flatten()

bar_color = '#4682B4'

for ax, col in zip(axes, num_features):
    ax.hist(houses[col], color=bar_color, bins=20, edgecolor='white', alpha=0.7)
    ax.set_title(col, fontsize=12)
    ax.set_ylabel("Frequency", fontsize=10)
    ax.grid(True, linestyle='--', alpha=0.6)

# Remove the grid positions that have no feature to show.
for unused_ax in axes[len(num_features):]:
    fig.delaxes(unused_ax)

plt.tight_layout(rect=[0, 0, 1, 0.98])
plt.show()
No description has been provided for this image
In [11]:
# Pairwise Pearson correlations between all numeric features.
plt.figure(figsize=(25, 15))

corr_matrix = houses[num_features].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5, fmt='.2f')

plt.title('Data Correlation Heatmap (Numerical Variables)', fontsize=12)
plt.show()
No description has been provided for this image

Categorical Variables¶

In [12]:
temp = houses[object_cols].copy()

stats = pd.DataFrame({
    'Variable Name': temp.columns,
    'Missing Count': temp.isnull().sum().values,
    'Missing Rate (%)': 100*temp.isnull().mean().values,
    'Unique Count': temp.nunique().values,
    'Unique Rate (%)': 100*temp.nunique().values / len(temp)
}).sort_values(['Missing Rate (%)'], ascending=False)
stats
Out[12]:
Variable Name Missing Count Missing Rate (%) Unique Count Unique Rate (%)
38 PoolQC 1453 99.520548 3 0.205479
40 MiscFeature 1406 96.301370 4 0.273973
2 Alley 1369 93.767123 2 0.136986
39 Fence 1179 80.753425 4 0.273973
17 MasVnrType 872 59.726027 3 0.205479
32 FireplaceQu 690 47.260274 5 0.342466
33 GarageType 81 5.547945 6 0.410959
36 GarageCond 81 5.547945 5 0.342466
35 GarageQual 81 5.547945 5 0.342466
34 GarageFinish 81 5.547945 3 0.205479
25 BsmtFinType2 38 2.602740 6 0.410959
23 BsmtExposure 38 2.602740 4 0.273973
24 BsmtFinType1 37 2.534247 6 0.410959
21 BsmtQual 37 2.534247 4 0.273973
22 BsmtCond 37 2.534247 4 0.273973
29 Electrical 1 0.068493 5 0.342466
30 KitchenQual 0 0.000000 4 0.273973
28 CentralAir 0 0.000000 2 0.136986
31 Functional 0 0.000000 7 0.479452
27 HeatingQC 0 0.000000 5 0.342466
26 Heating 0 0.000000 6 0.410959
37 PavedDrive 0 0.000000 3 0.205479
41 SaleType 0 0.000000 9 0.616438
0 MSZoning 0 0.000000 5 0.342466
1 Street 0 0.000000 2 0.136986
10 Condition2 0 0.000000 8 0.547945
3 LotShape 0 0.000000 4 0.273973
4 LandContour 0 0.000000 4 0.273973
5 Utilities 0 0.000000 2 0.136986
6 LotConfig 0 0.000000 5 0.342466
7 LandSlope 0 0.000000 3 0.205479
8 Neighborhood 0 0.000000 25 1.712329
9 Condition1 0 0.000000 9 0.616438
11 BldgType 0 0.000000 5 0.342466
20 Foundation 0 0.000000 6 0.410959
12 HouseStyle 0 0.000000 8 0.547945
13 RoofStyle 0 0.000000 6 0.410959
14 RoofMatl 0 0.000000 8 0.547945
15 Exterior1st 0 0.000000 15 1.027397
16 Exterior2nd 0 0.000000 16 1.095890
18 ExterQual 0 0.000000 4 0.273973
19 ExterCond 0 0.000000 5 0.342466
42 SaleCondition 0 0.000000 6 0.410959

There are quite a few variables with missing values. We need to handle these before we apply label encoding.

In [13]:
def handle_missing_values(data, object_cols, strategy='fill_missing'):
    """Fill missing values in the given categorical columns (in place).

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose columns are modified in place (also returned for chaining).
    object_cols : list of str
        Names of the categorical columns to process.
    strategy : {'fill_missing', 'most_frequent'}
        'fill_missing' replaces NaNs with the literal string 'Missing';
        'most_frequent' replaces NaNs with the column's mode.

    Returns
    -------
    pd.DataFrame
        The same frame that was passed in.

    Raises
    ------
    ValueError
        If `strategy` is not one of the supported options. (Previously an
        unknown strategy silently left all NaNs untouched.)
    """
    if strategy not in ('fill_missing', 'most_frequent'):
        raise ValueError(f"Unknown strategy: {strategy!r}")

    for col in object_cols:
        if strategy == 'fill_missing':
            data[col] = data[col].fillna('Missing')
        else:  # 'most_frequent'
            data[col] = data[col].fillna(data[col].mode()[0])
    return data
In [14]:
houses = handle_missing_values(houses, object_cols, strategy='fill_missing')
In [15]:
num_cols = len(object_cols)
num_plots = 5  # Boxplots per figure/row.

# One figure per chunk of 5 categorical columns, boxplotting SalePrice by level.
for i in range(0, num_cols, num_plots):
    fig, axes = plt.subplots(1, num_plots, figsize=(18, 3.5))

    chunk = object_cols[i:i + num_plots]
    for j, col in enumerate(chunk):
        ax = axes[j]

        sns.boxplot(x=houses[col], y=houses[TARGET], ax=ax, linewidth=2,
                    fliersize=3, palette='viridis')

        ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha='center')
        ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda y, pos: f'{int(y/1_000)}k'))

        ax.set_title(f"{TARGET} by {col}", fontsize=12)
        ax.set_ylabel(TARGET, fontsize=10)

    # The last chunk may hold fewer than `num_plots` columns (43 % 5 = 3);
    # remove the leftover empty axes instead of displaying blank frames.
    for k in range(len(chunk), num_plots):
        fig.delaxes(axes[k])

    plt.tight_layout()
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [16]:
def label_encode_features(data, object_cols):
    """Label-encode categorical features into new columns prefixed 'le_'.

    The original string columns are kept; each encoded column is appended to
    `data` in place.

    Returns a dict with keys:
      'data'             - the mutated frame,
      'encoded_features' - the list of new column names,
      'label_encoder'    - the fitted LabelEncoder per original column.
    """
    label_encoders = {}
    encoded_features = []

    for col in object_cols:
        encoder = LabelEncoder()
        new_col = f'le_{col}'
        # Cast to str so NaN-free filled values and any mixed types encode uniformly.
        data[new_col] = encoder.fit_transform(data[col].astype(str))

        label_encoders[col] = encoder
        encoded_features.append(new_col)

    return {
        'data': data,
        'encoded_features': encoded_features,
        'label_encoder': label_encoders,
    }

le_result = label_encode_features(houses, object_cols)

houses = le_result.get('data').copy()
houses[le_result.get('encoded_features')].head(3)
Out[16]:
le_MSZoning le_Street le_Alley le_LotShape le_LandContour le_Utilities le_LotConfig le_LandSlope le_Neighborhood le_Condition1 ... le_GarageType le_GarageFinish le_GarageQual le_GarageCond le_PavedDrive le_PoolQC le_Fence le_MiscFeature le_SaleType le_SaleCondition
0 3 1 1 3 3 0 4 0 5 2 ... 1 2 5 5 2 3 2 1 8 4
1 3 1 1 3 3 0 2 0 24 1 ... 1 2 5 5 2 3 2 1 8 4
2 3 1 1 0 3 0 4 0 5 2 ... 1 2 5 5 2 3 2 1 8 4

3 rows × 43 columns

In [17]:
cat_features = le_result.get('encoded_features')

Feature Engineering¶

In [18]:
houses['GrLivArea_OverallQual'] = houses['GrLivArea'] * houses['OverallQual']

Preparation¶

In the following, we refer to the "classical" hold-out test set as the out-of-bag set (OOB for short). The reason is simply that the "actual" test set is the one whose predictions we submit for this competition.

In [19]:
# Final model features: numeric plus label-encoded categoricals (no Id/target).
features = [col for col in num_features + cat_features if col not in (ID, TARGET)]
In [20]:
X = houses[features]
# y = houses[TARGET]
# Train on log(1 + SalePrice); predictions are mapped back with np.expm1 later.
y = np.log1p(houses[TARGET])

X_train, X_oob, y_train, y_oob = train_test_split(X, y, 
                                                  test_size=0.2,  # In our case, this is the Out-Of-Bag (OOB) 
                                                  random_state=RANDOM_STATE)
In [21]:
# preprocessing_pipeline = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values
#     ('scaler', StandardScaler())                  # Scale numeric features
# ])

# X_train_scaled = preprocessing_pipeline.fit_transform(X_train)
# X_oob_scaled = preprocessing_pipeline.transform(X_oob)
In [22]:
from sklearn.compose import ColumnTransformer

# (Pipeline, StandardScaler and SimpleImputer are already imported at the top
# of the notebook; the duplicated imports and the unused `numerical_cols` /
# `categorical_cols` aliases have been removed.)

# --- Preprocessing pipeline ---
preprocessor = ColumnTransformer(
    transformers=[
        #
        # Numerical columns: mean-impute missing values, then standardize.
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), num_features),
        #
        # Label-encoded categorical columns are already numeric; no scaling needed.
        ('cat', 'passthrough', cat_features)
    ]
)

# Fit on the training split only, then apply the same transform to OOB.
X_train_scaled = preprocessor.fit_transform(X_train)
X_oob_scaled = preprocessor.transform(X_oob)

Modeling¶

In [23]:
USE_HYPERPARAMETER_OPTIMIZATION = False

With Hyperparameter Optimization¶

In [24]:
# Number of random hyperparameter combinations to evaluate.
PARAMETER_COMBINATIONS = 50

# NOTE(review): the six tracking lists below are not filled anywhere in this
# hyperparameter-optimization section; they are re-initialized by the
# "Without Hyperparameter Optimization" section. They look like leftovers.
# Track RMSE per fold.
train_rmse_per_fold = []
val_rmse_per_fold = []

# Track predictions per fold.
train_predictions = []
val_predictions = []

# Track RMSE per epoch per fold.
train_rmse_per_epoch = []
val_rmse_per_epoch = []    

# Search space for the randomized search.
param_grid = {
    'n_estimators': [300, 500, 1_000], # Epochs
    'max_depth': [3, 5, 7],
    'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1],
    'reg_lambda': [0.01, 0.1, 1, 10, 20],  # L2 regularization
    'reg_alpha': [0, 0.01, 0.1, 0.5],      # L1 regularization
    'gamma': [0.1, 0.3, 0.5],              # Regularize tree splits   
}

# Base estimator; early stopping requires an eval_set at fit time.
xgb_model = XGBRegressor(early_stopping_rounds=10, 
                         n_jobs=-1)

# 3-fold CV over the sampled combinations, scored by negative RMSE
# (scikit-learn maximizes scores, hence the negation).
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_iter=PARAMETER_COMBINATIONS,
    verbose=1,
    random_state=RANDOM_STATE,
    n_jobs=-1
)
In [25]:
%%time 

# NOTE(review): the OOB split is used here as the early-stopping eval_set
# while the search cross-validates on the training folds — OOB information
# leaks into model selection. Consider a separate validation split.
# TODO confirm this is intentional.
random_search.fit(X_train_scaled, y_train, 
                  eval_set=[(X_oob_scaled, y_oob)],
                  verbose=0)
Fitting 3 folds for each of 50 candidates, totalling 150 fits
CPU times: user 1.36 s, sys: 252 ms, total: 1.61 s
Wall time: 53.3 s
Out[25]:
RandomizedSearchCV(cv=3,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          callbacks=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None, device=None,
                                          early_stopping_rounds=10,
                                          enable_categorical=False,
                                          eval_metric=None, feature_types=None,
                                          gamma=None, grow_policy=None,
                                          importance_type=None,
                                          interaction_constraints=None,
                                          learning_rate=No...
                                          random_state=None, ...),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'colsample_bytree': [0.8, 1],
                                        'gamma': [0.1, 0.3, 0.5],
                                        'learning_rate': [0.001, 0.005, 0.01,
                                                          0.05, 0.1],
                                        'max_depth': [3, 5, 7],
                                        'n_estimators': [300, 500, 1000],
                                        'reg_alpha': [0, 0.01, 0.1, 0.5],
                                        'reg_lambda': [0.01, 0.1, 1, 10, 20],
                                        'subsample': [0.8, 1]},
                   random_state=42, scoring='neg_root_mean_squared_error',
                   verbose=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomizedSearchCV(cv=3,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          callbacks=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None, device=None,
                                          early_stopping_rounds=10,
                                          enable_categorical=False,
                                          eval_metric=None, feature_types=None,
                                          gamma=None, grow_policy=None,
                                          importance_type=None,
                                          interaction_constraints=None,
                                          learning_rate=No...
                                          random_state=None, ...),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'colsample_bytree': [0.8, 1],
                                        'gamma': [0.1, 0.3, 0.5],
                                        'learning_rate': [0.001, 0.005, 0.01,
                                                          0.05, 0.1],
                                        'max_depth': [3, 5, 7],
                                        'n_estimators': [300, 500, 1000],
                                        'reg_alpha': [0, 0.01, 0.1, 0.5],
                                        'reg_lambda': [0.01, 0.1, 1, 10, 20],
                                        'subsample': [0.8, 1]},
                   random_state=42, scoring='neg_root_mean_squared_error',
                   verbose=1)
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=10,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=-1,
             num_parallel_tree=None, random_state=None, ...)
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=10,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=-1,
             num_parallel_tree=None, random_state=None, ...)
In [26]:
random_search.best_params_
Out[26]:
{'subsample': 0.8,
 'reg_lambda': 1,
 'reg_alpha': 0.1,
 'n_estimators': 1000,
 'max_depth': 3,
 'learning_rate': 0.05,
 'gamma': 0.1,
 'colsample_bytree': 0.8}
In [27]:
best_model = random_search.best_estimator_

# Passing `eval_metric` to fit() is deprecated in recent XGBoost releases
# (it triggered the warning below in the original run); configure it on the
# estimator via set_params instead.
best_model.set_params(eval_metric='rmse')

# Refit the tuned model, tracking RMSE on both splits per boosting round.
best_model.fit(X_train_scaled, y_train,
               eval_set=[(X_train_scaled, y_train), (X_oob_scaled, y_oob)],
               verbose=False)

evals_result = best_model.evals_result()

train_rmse_per_epoch = evals_result['validation_0']['rmse']  # RMSE for training set
oob_rmse_per_epoch = evals_result['validation_1']['rmse']    # RMSE for OOB set

train_predictions = best_model.predict(X_train_scaled)  # Training set predictions
oob_predictions = best_model.predict(X_oob_scaled)      # OOB set predictions

# Reverse the log(1 + y) transformation back to dollar amounts.
train_predictions = np.expm1(train_predictions)
oob_predictions = np.expm1(oob_predictions)
`eval_metric` in `fit` method is deprecated for better compatibility with scikit-learn, use `eval_metric` in constructor or`set_params` instead.
In [28]:
# RMSE in the original dollar scale (targets were log1p-transformed).
y_train_dollars = np.expm1(y_train)
y_oob_dollars = np.expm1(y_oob)

train_rmse = np.sqrt(mean_squared_error(y_train_dollars, train_predictions))
oob_rmse = np.sqrt(mean_squared_error(y_oob_dollars, oob_predictions))

print(f"Final Train RMSE: {train_rmse:,.0f}")
print(f"Final OOB RMSE  : {oob_rmse:,.0f}")
Final Train RMSE: 19,103
Final OOB RMSE  : 28,313
In [29]:
# -------------------------------------- #
# Plot RMSE per epoch for the best model #
# -------------------------------------- #

# -------------------------------------- #
# Plot RMSE per epoch for the best model #
# -------------------------------------- #

fig, ax = plt.subplots(figsize=(6, 4))

train_epochs = range(1, len(train_rmse_per_epoch) + 1)
oob_epochs = range(1, len(oob_rmse_per_epoch) + 1)

ax.plot(train_epochs, train_rmse_per_epoch,
        label='Train RMSE', linestyle='-', lw=2, color='blue')
ax.plot(oob_epochs, oob_rmse_per_epoch,
        label='OOB RMSE', linestyle='-.', lw=2, color='orange')

ax.set_title('Train and OOB RMSE per Epoch (Best Parameter Combination)')
ax.set_xlabel('Boosting Round / Epoch')
ax.set_ylabel('RMSE')
ax.legend()

plt.tight_layout()
plt.show()
No description has been provided for this image
In [30]:
%%time

# --------------------------------------------- #
# Plot RMSE for each hyperparameter combination #
# --------------------------------------------- #

train_rmse_per_iter = []
oob_rmse_per_iter = []

for i in range(len(random_search.cv_results_['params'])):
    _model = XGBRegressor(**random_search.cv_results_['params'][i])
    
    _model.fit(X_train, y_train, 
               eval_set=[(X_oob, y_oob)],
               verbose=0)
    
    y_train_pred = _model.predict(X_train)
    y_oob_pred = _model.predict(X_oob)

    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    oob_rmse = np.sqrt(mean_squared_error(y_oob, y_oob_pred))
    
    train_rmse_per_iter.append(train_rmse)
    oob_rmse_per_iter.append(oob_rmse)

plt.figure(figsize=(6, 4))
plt.plot(range(1, len(train_rmse_per_iter) + 1), train_rmse_per_iter, label='Train RMSE', marker='o', markersize=2, color='blue')
plt.plot(range(1, len(oob_rmse_per_iter) + 1), oob_rmse_per_iter, label='OOB RMSE', marker='x', markersize=2, color='orange')

plt.title('Train and OOB RMSE per Parameter Combination')
plt.xlabel('Parameter Combination')
plt.ylabel('RMSE')
plt.legend()

plt.tight_layout()
plt.show()
No description has been provided for this image
CPU times: user 1min 59s, sys: 1.42 s, total: 2min
Wall time: 1min 1s

Without Hyperparameter Optimization¶

In [31]:
# Fixed-parameter baseline model (no hyperparameter search).
xgb_model = XGBRegressor(
    objective='reg:squarederror',  # Required optimization metric for this competition.
    random_state=RANDOM_STATE,
    n_estimators=500,              # Upper bound; early stopping usually ends sooner.
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,                 # Row subsampling per tree.
    colsample_bytree=0.8,          # Feature subsampling per tree.
    reg_alpha=0.1,                 # L1 regularization.
    reg_lambda=2.0,                # L2 regularization.
    early_stopping_rounds=50       # Requires an eval_set at fit() time.
)
In [32]:
kf = KFold(n_splits=5, 
           shuffle=True, 
           random_state=RANDOM_STATE)

# Per-fold RMSE (dollar scale).
train_rmse_per_fold = []
val_rmse_per_fold = []

# Per-fold predictions (dollar scale).
train_predictions = []
val_predictions = []

# 5-fold cross-validation on the training split; the validation fold also
# serves as the early-stopping eval_set.
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_scaled), start=1):
    X_train_cv, X_val_cv = X_train_scaled[train_idx], X_train_scaled[val_idx]
    y_train_cv, y_val_cv = y_train.iloc[train_idx], y_train.iloc[val_idx]

    xgb_model.fit(X_train_cv, y_train_cv, eval_set=[(X_val_cv, y_val_cv)], verbose=False)

    # Predict, then map back from log1p space to dollars.
    y_train_pred = np.expm1(xgb_model.predict(X_train_cv))
    y_val_pred = np.expm1(xgb_model.predict(X_val_cv))

    train_predictions.append(y_train_pred)
    val_predictions.append(y_val_pred)

    # RMSE against the dollar-scale targets.
    train_rmse = np.sqrt(mean_squared_error(np.expm1(y_train_cv), y_train_pred))
    val_rmse = np.sqrt(mean_squared_error(np.expm1(y_val_cv), y_val_pred))

    train_rmse_per_fold.append(train_rmse)
    val_rmse_per_fold.append(val_rmse)

    print(f"Fold {fold} -> Train RMSE: {train_rmse:.4f}, Validation RMSE: {val_rmse:.4f}")
Fold 1 -> Train RMSE: 3029.7441, Validation RMSE: 30521.5768
Fold 2 -> Train RMSE: 4552.0018, Validation RMSE: 32904.1754
Fold 3 -> Train RMSE: 3040.2980, Validation RMSE: 30577.1073
Fold 4 -> Train RMSE: 6313.2936, Validation RMSE: 23972.4075
Fold 5 -> Train RMSE: 6781.1205, Validation RMSE: 24642.8847
In [33]:
# Summarize the per-fold RMSE, then score the last fitted model on OOB.
train_mu, train_sd = np.mean(train_rmse_per_fold), np.std(train_rmse_per_fold)
val_mu, val_sd = np.mean(val_rmse_per_fold), np.std(val_rmse_per_fold)
print(f"Training   RMSE: {train_mu:>7,.0f} +- {train_sd:>6,.0f}")
print(f"Validation RMSE: {val_mu:>7,.0f} +- {val_sd:>6,.0f}")

# Dollar-scale RMSE on the held-out split.
y_oob_pred = xgb_model.predict(X_oob_scaled)
oob_rmse = np.sqrt(
    mean_squared_error(np.expm1(y_oob), np.expm1(y_oob_pred))
)
print(f"Final OOB RMSE : {oob_rmse:7,.0f}")
Training   RMSE:   4,743 +-  1,581
Validation RMSE:  28,524 +-  3,554
Final OOB RMSE :  28,011
In [34]:
# Refit on train + OOB combined to use all labeled data before submission.
TRAIN_ON_WHOLE_TRAIN = True

if TRAIN_ON_WHOLE_TRAIN: 
    X_train_total = np.concatenate([X_train_scaled, X_oob_scaled], axis=0)
    y_train_total = np.concatenate([y_train, y_oob], axis=0)
    
    # NOTE(review): the OOB split is part of the training data here, yet it is
    # also used as the early-stopping eval_set — the stopping criterion is
    # evaluated on seen data, and any later "OOB" metric is no longer held
    # out. TODO confirm this is intentional.
    xgb_model = xgb_model.fit(X_train_total, 
                              y_train_total, 
                              eval_set=[(X_oob_scaled, y_oob)], 
                              verbose=False)
    
    y_train_total_pred = xgb_model.predict(X_train_total)
    # RMSE in dollar scale (inverse of the log1p target transform).
    train_total_rmse = np.sqrt(mean_squared_error(np.expm1(y_train_total), 
                                          np.expm1(y_train_total_pred)))
    print(f"Total Train RMSE : {train_total_rmse:7,.0f}")
Total Train RMSE :   2,847

Stacking¶

In [35]:
# from sklearn.ensemble import StackingRegressor
# from lightgbm import LGBMRegressor

# xgb = XGBRegressor(
#     objective='reg:squarederror',  # Required optimization metric for this competition.
#     random_state=RANDOM_STATE,
#     n_estimators=500,
#     learning_rate=0.05,
#     max_depth=6,
#     subsample=0.8,
#     colsample_bytree=0.8,
#     reg_alpha=0.1,
#     reg_lambda=2.0,
#     verbosity=0, 
#     n_jobs=-1,
# )

# estimators = [
#     ('xgb', xgb),
#     ('lgbm', LGBMRegressor(verbosity=-1, n_jobs=-1))
# ]

# stacking_regressor = StackingRegressor(estimators=estimators, 
#                                        final_estimator=XGBRegressor(n_jobs=-1),
# #                                        cv=3,
#                                        n_jobs=-1, 
#                                        passthrough=False,
#                                        verbose=0,
#                                        )
In [36]:
# %%time 

# kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# # Track RMSE per fold
# train_rmse_per_fold = []
# val_rmse_per_fold = []

# # Perform Cross-Validation
# for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_scaled)):
#     X_train_cv, X_val_cv = X_train_scaled[train_idx], X_train_scaled[val_idx]
#     y_train_cv, y_val_cv = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
#     # Fit the stacking model on the current fold's training data
#     stacking_regressor.fit(X_train_cv, y_train_cv)

#     # Make predictions for the current training and validation sets
#     y_train_pred = stacking_regressor.predict(X_train_cv)
#     y_val_pred = stacking_regressor.predict(X_val_cv)
    
#     # Calculate RMSE for training and validation sets
#     train_rmse = np.sqrt(mean_squared_error(y_train_cv, y_train_pred))
#     val_rmse = np.sqrt(mean_squared_error(y_val_cv, y_val_pred))
    
#     # Track RMSE
#     train_rmse_per_fold.append(train_rmse)
#     val_rmse_per_fold.append(oob_rmse)
    
#     print(f"Fold {fold+1} -> Train RMSE: {train_rmse:,.0f}  Validation RMSE: {val_rmse:,.0f}")

# # Calculate average RMSE across all folds
# avg_train_rmse = np.mean(train_rmse_per_fold)
# avg_val_rmse = np.mean(val_rmse_per_fold)

# print()
# print(f"Average Training   RMSE: {avg_train_rmse:,.0f}")
# print(f"Average Validation RMSE: {avg_val_rmse:,.0f}")

# # Fit final model on full training data
# stacking_regressor.fit(X_train_scaled, y_train)

# # Make final predictions on OOB set
# oob_predictions = stacking_regressor.predict(X_oob_scaled)
# final_oob_rmse = np.sqrt(mean_squared_error(y_oob, oob_predictions))

# print(f"Final OOB RMSE         : {final_oob_rmse:,.0f}")

Validation¶

In [37]:
# Select which model and which prediction arrays the validation plots use.
model = None

if USE_HYPERPARAMETER_OPTIMIZATION: 
    model = best_model
    # `train_predictions` already holds the tuned model's train predictions
    # (the original code contained a redundant self-assignment here).
    val_predictions = oob_predictions
else:
    model = xgb_model
    # Flatten the per-fold CV prediction lists into single arrays.
    train_predictions = np.concatenate(train_predictions)
    val_predictions = np.concatenate(val_predictions)
In [38]:
# Evaluate the selected model on the OOB set (dollar-scale RMSE).
# NOTE(review): if TRAIN_ON_WHOLE_TRAIN was True above, the model was refit on
# data that includes this OOB split, so this RMSE is optimistic (which would
# explain why it is far below the CV validation RMSE). TODO confirm.
y_oob_pred = model.predict(X_oob_scaled)
oob_rmse = np.sqrt(mean_squared_error(np.expm1(y_oob), np.expm1(y_oob_pred)))

print(f"OOB RMSE: {oob_rmse:.4f}")
OOB RMSE: 3006.8506
In [39]:
# Replace +/- inf with NaN so the histograms below can drop them cleanly.
train_predictions_clean = np.where(np.isinf(train_predictions), np.nan, train_predictions)
val_predictions_clean = np.where(np.isinf(val_predictions), np.nan, val_predictions)
In [40]:
# --------------------------------------------------------- #
# Confidence Intervals for Train and Validation Predictions #
# --------------------------------------------------------- #

plt.figure(figsize=(10, 4))

# Overlay train and validation prediction distributions (dollar scale).
sns.histplot(train_predictions_clean, kde=False, 
             color="#4C72B0", linewidth=.75, edgecolor='white', alpha=0.75, fill=True,
             label=f"Train Predictions (Mean RMSE: {train_mu:,.0f})")
sns.histplot(val_predictions_clean, kde=False, 
             color="#55A868", linewidth=.75, edgecolor='white', alpha=1.0, fill=True,
             label=f"Validation Predictions (Mean RMSE: {val_mu:,.0f})")

# NOTE(review): this vertical line is positioned at the *mean predicted
# price* on the OOB set, but its label reports the OOB *RMSE* — the label
# does not describe the line's position. TODO confirm intent.
plt.axvline(np.mean(np.expm1(y_oob_pred)), color="red", linestyle="--", linewidth=1.75, label=f"OOB Prediction Mean RMSE: {oob_rmse:,.0f}")

plt.title("Comparing Train, Validation and OOB Prediction", fontsize=12)
plt.xlabel("Predicted SalePrice")
plt.ylabel("Density")
plt.legend()

# Render prices in thousands, e.g. 200000 -> "200k".
ax = plt.gca()
ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda y, pos: f'{int(y/1_000)}k'))

plt.show()

# Display Train/Validation RMSE per fold.
for i, (train_rmse, val_rmse) in enumerate(zip(train_rmse_per_fold, val_rmse_per_fold), 1):
    print(f"Fold {i} -> Train RMSE: {train_rmse:.4f}, Validation RMSE: {val_rmse:.4f}")
use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
No description has been provided for this image
Fold 1 -> Train RMSE: 3029.7441, Validation RMSE: 30521.5768
Fold 2 -> Train RMSE: 4552.0018, Validation RMSE: 32904.1754
Fold 3 -> Train RMSE: 3040.2980, Validation RMSE: 30577.1073
Fold 4 -> Train RMSE: 6313.2936, Validation RMSE: 23972.4075
Fold 5 -> Train RMSE: 6781.1205, Validation RMSE: 24642.8847
In [41]:
y_pred_train = model.predict(X_train_scaled)
y_pred_oob = model.predict(X_oob_scaled)
In [42]:
def plot_with_histograms(y_train, y_pred_train, y_oob, y_pred_oob):
    """Plots train and oob predictions with histograms on the right and top."""

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # Shared "250k"-style tick formatter (prices are in dollars).
    kformat = ticker.FuncFormatter(lambda v, pos: f'{int(v/1_000)}k')

    panels = [
        (y_train, y_pred_train, "#4C72B0", 'Train Predictions vs True Values'),
        (y_oob,   y_pred_oob,   "#55A868", 'OOB Predictions vs True Values'),
    ]

    for ax, (y_true, y_pred, dot_color, panel_title) in zip(axes, panels):
        # Predicted vs. true scatter with the y = x reference line.
        ax.scatter(y_true, y_pred, color=dot_color, alpha=0.6, s=40, edgecolor='white')
        lo, hi = min(y_true), max(y_true)
        ax.plot([lo, hi], [lo, hi], '--', color='red')
        ax.set_xlabel("True SalePrice")
        ax.set_ylabel("Predicted SalePrice")
        ax.xaxis.set_major_formatter(kformat)
        ax.yaxis.set_major_formatter(kformat)

        divider = make_axes_locatable(ax)

        # Marginal histogram of predictions along the top edge.
        ax_histx = divider.append_axes("top", 0.8, pad=0.1, sharex=ax)
        ax_histx.hist(y_pred, bins=30, color='#FF6347', alpha=0.6, edgecolor='white')
        ax_histx.set_ylabel('Count')
        ax_histx.xaxis.set_major_formatter(kformat)
        ax_histx.set_title(panel_title, fontsize=12)

        # Marginal histogram of true values along the right edge.
        ax_histy = divider.append_axes("right", 0.8, pad=0.1, sharey=ax)
        ax_histy.hist(y_true, bins=30, color='#9370DB', alpha=0.6, orientation='horizontal', edgecolor='white')
        ax_histy.set_xlabel('Count')
        ax_histy.yaxis.set_major_formatter(kformat)

        # Hide tick labels duplicated from the shared main axes.
        plt.setp(ax_histx.get_xticklabels(), visible=False)
        plt.setp(ax_histy.get_yticklabels(), visible=False)


plot_with_histograms(np.expm1(y_train), 
                     np.expm1(y_pred_train), 
                     np.expm1(y_oob), 
                     np.expm1(y_pred_oob))
No description has been provided for this image

Let's take a look at the spread of errors.

In [43]:
def plot_residuals(y_true_train, y_pred_train, y_true_oob, y_pred_oob):
    """Scatter the prediction errors (true - predicted) for the train and OOB sets."""
    kformat = ticker.FuncFormatter(lambda v, pos: f'{int(v/1_000)}k')

    fig, axes = plt.subplots(1, 2, figsize=(10, 4))

    panels = [
        (y_true_train, y_true_train - y_pred_train, "#4C72B0", "Train Residuals"),
        (y_true_oob,   y_true_oob - y_pred_oob,     "#55A868", "OOB Residuals"),
    ]

    for ax, (y_true, residuals, dot_color, panel_title) in zip(axes, panels):
        # Residuals against the true price, with a zero reference line.
        ax.scatter(y_true, residuals, color=dot_color, s=40, edgecolor='white', alpha=0.6)
        ax.axhline(0, linestyle='--', color='red')
        ax.set_title(panel_title, fontsize=12)
        ax.set_xlabel("True SalePrice")
        ax.set_ylabel("Residuals")
        ax.xaxis.set_major_formatter(kformat)
        ax.yaxis.set_major_formatter(kformat)

    plt.tight_layout()
    plt.show()
    
plot_residuals(np.expm1(y_train), 
               np.expm1(y_pred_train), 
               np.expm1(y_oob), 
               np.expm1(y_pred_oob))
No description has been provided for this image

Feature Importances¶

In [44]:
# Calculate SHAP values.
# shap.Explainer auto-selects an algorithm appropriate for `model`;
# values are computed on the scaled hold-out matrix (log1p target space).
explainer = shap.Explainer(model)
shap_values = explainer.shap_values(X_oob_scaled)
In [45]:
# Sanity checks: the SHAP matrix must align with the hold-out data
# (one row per sample, one column per feature).
assert shap_values.shape[0] == X_oob.shape[0], "ERROR: Mismatch in number of samples!"
assert shap_values.shape[1] == X_oob.shape[1], "ERROR: Mismatch in number of features!"

# --- SHAP Summary Plot on Test Set Sample ---
# Beeswarm of per-sample SHAP values, features ordered by overall impact.
shap.summary_plot(shap_values, X_oob_scaled, feature_names=features, plot_size=(15, 6))
No description has been provided for this image
In [46]:
# Bar variant of the summary: mean |SHAP| per feature (global importance).
shap.summary_plot(shap_values, X_oob_scaled, plot_type='bar', feature_names=features, plot_size=(6, 6))
No description has been provided for this image
In [47]:
# Interactive force plot over ALL hold-out samples (needs shap.initjs(),
# called in the first cell; does not render in static viewers).
shap.force_plot(base_value=explainer.expected_value, 
                shap_values=shap_values, 
                features=X_oob_scaled,
                feature_names=features)
Out[47]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [48]:
# Force plot for a single hold-out sample (index 0).
# NOTE(review): this and the next two cells are copy-pasted for indices
# 0-2 — a small loop or helper function would avoid the repetition.
shap.plots.force(base_value=explainer.expected_value, 
                 shap_values=shap_values[0],
                 feature_names=features)
Out[48]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [49]:
# Force plot for hold-out sample index 1.
shap.plots.force(base_value=explainer.expected_value, 
                 shap_values=shap_values[1],
                 feature_names=features)
Out[49]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [50]:
# Force plot for hold-out sample index 2.
shap.plots.force(base_value=explainer.expected_value, 
                 shap_values=shap_values[2],
                 feature_names=features)
Out[50]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [51]:
# Explanation object used by the scatter plots below.
# NOTE(review): computed on the UNSCALED X_oob, while shap_values above used
# X_oob_scaled — confirm which matrix matches the model's training input.
explanation = explainer(X_oob)
In [52]:
# Dependence scatter: OverallQual vs. its SHAP values, colored by the
# encoded kitchen quality to surface feature interactions.
shap.plots.scatter(explanation[:, 'OverallQual'], color=explanation[:, 'le_KitchenQual'],
                  dot_size=30, alpha=0.7, x_jitter=0.4, cmap=plt.get_cmap('viridis'))
No description has been provided for this image
In [53]:
# Scatter for the feature with the highest mean |SHAP|; passing the whole
# Explanation as `color` lets shap auto-pick the interaction feature.
shap.plots.scatter(explanation[:, explanation.abs.mean(0).argsort[-1]], color=explanation,
                   title='1st Most Important Feature',
                   dot_size=30, alpha=0.7, x_jitter=0.4, cmap=plt.get_cmap('viridis'))
No description has been provided for this image
In [54]:
# Same top feature, explicitly colored by the encoded kitchen quality.
shap.plots.scatter(explanation[:, explanation.abs.mean(0).argsort[-1]], color=explanation[:, 'le_KitchenQual'],
                   title='1st Most Important Feature',
                   dot_size=30, alpha=0.7, x_jitter=0.4, cmap=plt.get_cmap('viridis'))
No description has been provided for this image
In [55]:
# 2nd most important feature, single solid color (no interaction coloring).
shap.plots.scatter(explanation[:, explanation.abs.mean(0).argsort[-2]], color=plt.get_cmap('viridis')(0.0),
                   title='2nd Most Important Feature',
                   dot_size=30, alpha=0.7)
No description has been provided for this image
In [56]:
# 3rd most important feature, single solid color.
shap.plots.scatter(explanation[:, explanation.abs.mean(0).argsort[-3]], color=plt.get_cmap('viridis')(0.0),
                   title='3rd Most Important Feature', 
                   dot_size=30, alpha=0.7)
No description has been provided for this image
In [57]:
# Custom matplotlib version of the SHAP dependence scatter: the 3rd most
# important feature on x/y, colored by the 1st most important feature.
# Hoist the feature ranking so it is computed once instead of three times.
ranking = explanation.abs.mean(0).argsort
third_feature = explanation[:, ranking[-3]]
first_feature = explanation[:, ranking[-1]]

fig, ax = plt.subplots(tight_layout=True, figsize=(10, 5))

scatter = ax.scatter(
    third_feature.data,    # raw feature values on x
    third_feature.values,  # SHAP values on y
    c=first_feature.data,  # color by the top feature's raw values
    marker="^",
    cmap=plt.get_cmap("rainbow"),
    rasterized=True,
    zorder=5,
)

cbar = plt.colorbar(scatter, aspect=50, format="%2.1f")
cbar.set_label("1st", fontsize=14)  # f-prefix removed: no placeholders
cbar.outline.set_visible(False)

ax.set_title("Customization", fontsize=18)
# Fixed labels: the plotted feature is argsort[-3], i.e. the 3rd most
# important one — the originals said "2nd".
ax.set_xlabel("3rd", fontsize=16)
ax.set_ylabel("SHAP value for\n3rd", fontsize=16)

ax.tick_params(labelsize=14)

ax.grid(linestyle="--", color="gray", linewidth=0.5, zorder=0, alpha=0.5)

plt.show()
No description has been provided for this image
In [58]:
# Absolute prediction error in dollars for every hold-out sample.
differences = np.abs(np.expm1(y_oob_pred) - np.expm1(y_oob))

# Best- and worst-predicted samples.
closest_index = np.argmin(differences)
farthest_index = np.argmax(differences)

# NOTE(review): these per-row SHAP values are computed on UNSCALED rows and
# are not used by the decision plots below (they index `shap_values`
# directly) — confirm whether these two lines are still needed.
shap_values_closest = explainer.shap_values(X_oob.iloc[[closest_index]])
shap_values_farthest = explainer.shap_values(X_oob.iloc[[farthest_index]])
In [59]:
plt.figure(figsize=(6,6))

# Decision plot for the sample with the SMALLEST absolute error: shows how
# each feature pushes the prediction from the expected value to the output.
# (Title typo fixed: "Closted" -> "Closest".)
shap.decision_plot(explainer.expected_value, 
                   shap_values[closest_index],
                   X_oob.iloc[closest_index].values,
                   feature_names=features,
                   highlight=0,
                   title='Decision Plot of Prediction Closest to True SalePrice',
                   auto_size_plot=False)

plt.show()
No description has been provided for this image
In [60]:
plt.figure(figsize=(6,6))

# Decision plot for the sample with the LARGEST absolute error — the
# counterpart to the closest-prediction plot above.
shap.decision_plot(explainer.expected_value, 
                   shap_values[farthest_index],
                   X_oob.iloc[farthest_index].values,
                   feature_names=features,
                   highlight=0,
                   title='Decision Plot of Prediction Farthest from True SalePrice',                   
                   auto_size_plot=False)

plt.show()
No description has been provided for this image

Submission¶

In [61]:
# Load the Kaggle test split (no SalePrice column) and report its size.
houses_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
n_rows, n_cols = houses_test.shape
print(f"Rows: {n_rows:,}")
print(f"Cols: {n_cols:,}")
houses_test.head(3)
Rows: 1,459
Cols: 80
Out[61]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 1461 20 RH 80.0 11622 Pave NaN Reg Lvl AllPub ... 120 0 NaN MnPrv NaN 0 6 2010 WD Normal
1 1462 20 RL 81.0 14267 Pave NaN IR1 Lvl AllPub ... 0 0 NaN NaN Gar2 12500 6 2010 WD Normal
2 1463 60 RL 74.0 13830 Pave NaN IR1 Lvl AllPub ... 0 0 NaN MnPrv NaN 0 3 2010 WD Normal

3 rows × 80 columns

In [62]:
# Reuse the train-time missing-value strategy on the test set's object columns.
houses_test = handle_missing_values(houses_test, object_cols, strategy='fill_missing')
In [63]:
# Apply the train-fitted label encoders to the test set.
# Instead of calling le.transform() once per row inside .apply()
# (O(rows * classes) per column), build a {class -> code} lookup and use a
# vectorized .map(). LabelEncoder codes are the positions in le.classes_,
# so the mapping reproduces transform() exactly; categories unseen during
# training (absent from the dict, including NaN) become -1, as before.
for obj_col in object_cols:
    le = le_result.get('label_encoder').get(obj_col)
    class_to_code = {cls: code for code, cls in enumerate(le.classes_)}
    houses_test[f'le_{obj_col}'] = (
        houses_test[obj_col].map(class_to_code).fillna(-1).astype(int)
    )

houses_test[le_result.get('encoded_features')].head(3)    
Out[63]:
le_MSZoning le_Street le_Alley le_LotShape le_LandContour le_Utilities le_LotConfig le_LandSlope le_Neighborhood le_Condition1 ... le_GarageType le_GarageFinish le_GarageQual le_GarageCond le_PavedDrive le_PoolQC le_Fence le_MiscFeature le_SaleType le_SaleCondition
0 2 1 1 3 3 0 4 0 12 1 ... 1 3 5 5 2 3 3 1 8 4
1 3 1 1 0 3 0 0 0 12 2 ... 1 3 5 5 2 3 2 0 8 4
2 3 1 1 0 3 0 4 0 8 2 ... 1 0 5 5 2 3 3 1 8 4

3 rows × 43 columns

In [64]:
# Scale the test features with the preprocessor fitted on the training data.
X_sub_scaled = preprocessor.transform(houses_test[features])
In [65]:
# Predict and invert the log1p target transform back to dollar prices.
y_sub_pred = np.expm1(model.predict(X_sub_scaled))
y_sub_pred[:10]
Out[65]:
array([122455.41, 157231.78, 178254.45, 190086.  , 187468.19, 172464.73,
       173588.95, 169919.52, 175365.38, 123989.9 ], dtype=float32)
In [66]:
plt.figure(figsize=(8, 4))

# Overlay the train predictions and the submission predictions to sanity-check
# that the test-set prediction distribution looks plausible.
# (f-prefixes removed from the labels: they contained no placeholders.)
plt.hist(train_predictions_clean, bins=50, edgecolor='white', linewidth=1, color='#4C72B0', alpha=0.5, label="Train Predictions")
plt.hist(y_sub_pred,              bins=50, edgecolor='white', linewidth=1, color='#9370DB', alpha=1.0, label="Submission Predictions")
plt.xlabel(TARGET)
plt.ylabel("Count")  # added: the histograms show raw counts
plt.title("Distribution of Target SalePrice (Submission file)", fontsize=12)
plt.legend()

ax = plt.gca()
ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda y, pos: f'{int(y/1_000)}k'))

plt.show()
No description has been provided for this image
In [67]:
# Build the submission file in the format Kaggle expects: Id, SalePrice.
submission_df = pd.DataFrame({
    ID: houses_test[ID],
    TARGET: y_sub_pred
})

submission_df.to_csv('/kaggle/working/submission.csv', index=False)
# f-prefix removed: the message contains no placeholders.
print("Submission file 'submission.csv' created successfully!")

submission_df
Submission file 'submission.csv' created successfully!
Out[67]:
Id SalePrice
0 1461 122455.406250
1 1462 157231.781250
2 1463 178254.453125
3 1464 190086.000000
4 1465 187468.187500
... ... ...
1454 2915 81067.679688
1455 2916 78969.656250
1456 2917 160173.062500
1457 2918 118705.812500
1458 2919 222827.031250

1459 rows × 2 columns


💚 Thank you for reading 💚

If you have any questions or feedback, feel free to leave a comment 🤔

This notebook is still in progress.

Please UPVOTE if you enjoyed this notebook 🙏